Goals

  1. List the top hits of genes that comutate with KRAS in CRC (non-allele-specific).
  2. Compare expression levels of DUSP genes between KRAS alleles in CRC.

Comutation interaction list

Below is a list of the top comutating genes with KRAS in CRC. The last four columns have the number of tunor samples with the various combination of mutations; for example, G mut & K WT has the number of tumors with the other gene (G) mutated and KRAS (K) WT.

nonallele_specific_increased_comutation_df %.% {
  filter(cancer == "COAD" & hugo_symbol != "KRAS")
  filter(p_value < 0.01)
  arrange(p_value)
  mutate(
    p_value = scales::scientific(p_value, digits = 3),
    odds_ratio = round(odds_ratio, digits = 3),
    geneWT_krasWT = map_dbl(comut_ct_tbl, ~ .x[1, 1]),
    geneMut_krasWT = map_dbl(comut_ct_tbl, ~ .x[2, 1]),
    geneWT_krasMut = map_dbl(comut_ct_tbl, ~ .x[1, 2]),
    geneMut_krasMut = map_dbl(comut_ct_tbl, ~ .x[2, 2]),
  )
  select(hugo_symbol, p_value, odds_ratio, tidyselect::starts_with("gene"))
  rename(
    gene = hugo_symbol,
    `p-value` = p_value,
    `odds ratio` = odds_ratio,
    `G WT & K WT` = geneWT_krasWT,
    `G mut & K WT` = geneMut_krasWT,
    `G WT & K mut` = geneWT_krasMut,
    `G mut & K mut` = geneMut_krasMut
    )
}

Differential expression of DUSP genes

The data used for this analysis was the RNA expression data from TCGA-COAD. This data set has RNA expression for 525 tumor samples. 78 of these samples are hypermutants; these samples were removed from the analysis.

main_alleles <- c("WT", "KRAS_G12A", "KRAS_G12C", "KRAS_G12D", "KRAS_G12V", "KRAS_G13D")

dusp_rna_data <- tcga_coad_rna %.% {
  filter(str_detect(hugo_symbol, "DUSP"))
  filter(!is_hypermutant)
  select(-is_hypermutant)
  mutate(
    allele = ifelse(ras_allele %in% !!main_alleles, ras_allele, "Other"),
    allele = str_remove(allele, "KRAS_"),
    allele = factor_alleles(allele)
  )
}

# Put DUSP genes in order.
dusp_order <- unique(dusp_rna_data$hugo_symbol)
dusp_num <- as.numeric(str_remove_all(dusp_order, "[:alpha:]"))
dusp_order <- dusp_order[order(dusp_num)]
dusp_rna_data$hugo_symbol <- factor(dusp_rna_data$hugo_symbol, levels = dusp_order)

Below is a sample of the RNA expression data table.

dusp_rna_data %>%
  rename(
    `DUSP` = hugo_symbol,
    `tumor sample` = tumor_sample_barcode,
    `RNA (RSEM)` = rna_expr
  ) %>%
  select(-ras_allele) %>%
  head() %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
DUSP tumor sample RNA (RSEM) allele
DUSP10 TCGA-3L-AA1B 254.352 WT
DUSP10 TCGA-4N-A93T 133.527 G12D
DUSP10 TCGA-4T-AA8H 275.635 G12V
DUSP10 TCGA-5M-AAT4 507.745 G12D
DUSP10 TCGA-5M-AAT5 572.762 G12D
DUSP10 TCGA-5M-AATA 209.145 WT

The number of tumor samples with missing data for each DUSP gene.

dusp_rna_data %>%
  filter(is.na(rna_expr)) %>%
  count(hugo_symbol, sort = TRUE) %>%
  pivot_wider(names_from = hugo_symbol, values_from = n) %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE,
    position = "left"
  )
DUSP13 DUSP21
149 149

The number of tumor samples with 0 RNA expression values for each DUSP gene.

dusp_rna_data %>%
  filter(rna_expr <= 0) %>%
  count(hugo_symbol, sort = TRUE) %>%
  pivot_wider(names_from = hugo_symbol, values_from = n) %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE,
    position = "left"
  )
DUSP21 DUSP13 DUSP5P DUSP26 DUSP9 DUSP27 DUSP15
284 217 73 58 34 11 2

All negative RNA expression values were set to 0.

dusp_rna_data %<>% mutate(rna_expr = map_dbl(rna_expr, ~ max(0, .x)))

Inspect the distribution of RNA expression values

dusp_rna_data %>%
  filter(!is.na(rna_expr)) %>%
  mutate(rna_expr = rna_expr + 1) %>%
  ggplot(aes(x = allele, y = rna_expr)) +
  facet_wrap(~hugo_symbol, scales = "free_y") +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(trans = "log10") +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text = element_text(size = 7),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(x = NULL, y = "RNA expression (log10 + 1)")

DUSP13 and DUSP21 were removed from the analysis because they were missing data and had very low expression levels.

IGNORE_DUSPS <- c("DUSP13", "DUSP21")
dusp_rna_data %<>% filter(!hugo_symbol %in% IGNORE_DUSPS)
plot_dusp_distribtions <- function(df, x) {
  df %>%
    ggplot(aes(x = {{ x }})) +
    facet_wrap(~hugo_symbol, scales = "free") +
    scale_y_continuous(expand = expansion(c(0, 0.02))) +
    geom_density() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"),
      axis.text = element_text(size = 6),
      axis.text.x = element_text(angle = 30, hjust = 1),
      strip.text = element_text(size = 7),
      panel.spacing = unit(1, "mm")
    ) +
    labs(x = "RNA expression")
}

plot_dusp_distribtions(dusp_rna_data, rna_expr) +
  ggtitle("Non-transformed RNA expression values")

dusp_rna_data %>%
  mutate(rna_expr = log10(rna_expr + 1)) %>%
  plot_dusp_distribtions(rna_expr) +
  ggtitle("log10(RNA + 1) transformed data")

dusp_rna_data %>%
  mutate(rna_expr = sqrt(rna_expr)) %>%
  plot_dusp_distribtions(rna_expr) +
  ggtitle("sqrt(RNA) transformed data")

dusp_rna_data %>%
  group_by(hugo_symbol) %>%
  mutate(rna_expr = scale_numeric(rna_expr)) %>%
  ungroup() %>%
  plot_dusp_distribtions(rna_expr) +
  ggtitle("Centralized and scaled (z-scale) data")

dusp_rna_data %>%
  mutate(rna_expr = sqrt(rna_expr)) %>%
  group_by(hugo_symbol) %>%
  mutate(rna_expr = scale_numeric(rna_expr)) %>%
  ungroup() %>%
  plot_dusp_distribtions(rna_expr) +
  ggtitle("sqrt(RNA) & z-scaled data")

The \(\log_{10}(\text{RNA} + 1)\), centralized, and scaled values will be used for the analysis.

dusp_rna_data %<>%
  mutate(log10_rna_expr = log10(rna_expr + 1)) %>%
  group_by(hugo_symbol) %>%
  mutate(log10_z_rna = scale_numeric(log10_rna_expr)) %>%
  ungroup()

Check for correlations in DUSP expression

dusp_corr <- dusp_rna_data %>%
  pivot_wider(id = tumor_sample_barcode, names_from = hugo_symbol, values_from = log10_z_rna) %>%
  select(-tumor_sample_barcode) %>%
  corrr::correlate()

dusp_corr_pheat <- dusp_corr %>%
  as.data.frame() %>%
  column_to_rownames("rowname")
colnames(dusp_corr_pheat) <- str_remove(colnames(dusp_corr_pheat), "DUSP")
rownames(dusp_corr_pheat) <- str_remove(rownames(dusp_corr_pheat), "DUSP")
pheatmap::pheatmap(
  dusp_corr_pheat,
  border_color = NA,
  na_col = "white",
  main = "Correlation of DUSP gene expression",
)

corrr::network_plot(dusp_corr, min_cor = 0.3) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "DUSP gene expression correlation network")

Modeling

new_allele_order <- as.character(sort(unique(dusp_rna_data$allele)))
new_allele_order <- c("WT", new_allele_order[new_allele_order != "WT"])

data <- dusp_rna_data %>%
  mutate(
    grp_allele = case_when(
      allele == "G13D" ~ "G13D",
      allele == "WT" ~ "WT",
      TRUE ~ "KRAS"
    ),
    grp_allele = factor(grp_allele, levels = c("WT", "G13D", "KRAS")),
    allele = factor(as.character(allele), levels = new_allele_order)
  )

# FOR TESTING
if (FALSE) {
  set.seed(0)
  TESTING_DUSPS <- paste0("DUSP", 1:6)
  TESTING_TSBS <- sample(unique(data$tumor_sample_barcode), 100)
  data <- data %.% {
    filter(hugo_symbol %in% TESTING_DUSPS)
    filter(tumor_sample_barcode %in% TESTING_TSBS)
  }
}
stash("lm_stan_hier", depends_on = "data", {
  lm_stan_hier <- stan_glmer(
    log10_z_rna ~ 1 + (1 + allele | hugo_symbol),
    data = data,
    prior = normal(location = 0, scale = 5),
    prior_intercept = normal(location = 0, scale = 2),
    prior_aux = exponential(rate = 1),
    prior_covariance = decov(regularization = 1, concentration = 1, shape = 1, scale = 1)
  )
})
#> Loading stashed object.

Trace plots for the global intercept and standard deviation \(\sigma\).

mcmc_trace(lm_stan_hier, pars = "(Intercept)") /
  mcmc_trace(lm_stan_hier, pars = "sigma")

Trace plots for parameters for DUSP1.

mcmc_trace(lm_stan_hier, regex_pars = "DUSP1\\]") +
  scale_x_continuous(expand = c(0, 0))
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.

pp_check(lm_stan_hier, plotfun = "stat", stat = "mean") + 
  ggtitle("Distrbition of error of the posterior predictions")

pp_check(lm_stan_hier, plotfun = "stat_2d", stat = c("mean", "sd")) +
  ggtitle("Standard deviation and mean of the error of the posterior predictions")

lm_dusp_post <- as.data.frame(lm_stan_hier) %.% {
  mutate(draw = row_number())
  select(draw, `(Intercept)`, tidyselect::contains("DUSP"))
  pivot_longer(
    -c(draw, `(Intercept)`),
    names_to = "parameter",
    values_to = "value"
  )
  mutate(
    parameter = str_remove_all(parameter, "[:punct:]"),
    parameter = str_remove_all(parameter, "b|allele|hugosymbol|")
  )
  separate(parameter, c("allele", "dusp"), sep = " ")
  mutate(allele = str_replace(allele, "Intercept", "WT"))
}

lm_dusp_post %>%
  ggplot(aes(x = value)) +
  facet_wrap(~dusp, ncol = 4, scales = "free") +
  geom_vline(
    xintercept = 0,
    lty = 2,
    color = "grey50",
    size = 0.9
  ) +
  geom_density(aes(color = allele), size = 1) +
  scale_color_manual(values = short_allele_pal) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = expansion(c(0, 0.02))) +
  labs(
    x = "posterior distribution",
    y = "probability density",
    color = "KRAS allele"
  )

lm_dusp_post %.%
  {
    group_by(dusp, allele)
    summarise(
      avg = median(value),
      hdi_50 = list(unlist(ggdist::hdi(value, .width = 0.50))),
      hdi_89 = list(unlist(ggdist::hdi(value, .width = 0.89))),
    )
    ungroup()
    mutate(
      hdi_50_lower = map_dbl(hdi_50, ~ .x[[1]]),
      hdi_50_upper = map_dbl(hdi_50, ~ .x[[2]]),
      hdi_89_lower = map_dbl(hdi_89, ~ .x[[1]]),
      hdi_89_upper = map_dbl(hdi_89, ~ .x[[2]])
    )
    select(-hdi_50, -hdi_89)
  } %>%
  ggplot(aes(x = avg, y = allele, color = allele)) +
  facet_wrap(~dusp, scales = "free_x") +
  geom_rect(
    xmin = -0.1, xmax = 0.1, ymin = Inf, ymax = -Inf,
    fill = "grey80",
    color = NA,
    alpha = 0.1
  ) +
  geom_vline(xintercept = 0, color = "grey50") +
  geom_point(size = 2) +
  geom_linerange(
    aes(xmin = hdi_50_lower, xmax = hdi_50_upper),
    size = 1.2,
    alpha = 0.8
  ) +
  geom_linerange(
    aes(xmin = hdi_89_lower, xmax = hdi_89_upper),
    size = 0.9,
    alpha = 0.5
  ) +
  scale_color_manual(values = short_allele_pal, drop = TRUE) +
  theme(legend.position = "none") +
  labs(
    x = "posterior distributions",
    y = NULL
  )

bayestestR::describe_posterior(lm_stan_hier, effects = "all") %>%
  arrange(Effects) %>%
  as_tibble() %>%
  mutate(
    Parameter = str_remove_all(Parameter, "[:punct:]"),
    Parameter = str_remove_all(Parameter, "^b|allele|hugosymbol"),
  ) %>%
  mutate_if(is.numeric, round, digits = 2)
#> Possible multicollinearity between Sigma[hugo_symbol:alleleG12D,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.79), Sigma[hugo_symbol:alleleG12V,(Intercept)] and Sigma[hugo_symbol:(Intercept),(Intercept)] (r = 0.71), Sigma[hugo_symbol:alleleG12D,alleleG12D] and Sigma[hugo_symbol:alleleG12D,(Intercept)] (r = 0.77), Sigma[hugo_symbol:alleleG12V,alleleG12V] and Sigma[hugo_symbol:alleleG12V,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG12D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleG13D,alleleG13D] and Sigma[hugo_symbol:alleleG13D,(Intercept)] (r = 0.72), Sigma[hugo_symbol:alleleOther,alleleG12D] and Sigma[hugo_symbol:alleleOther,(Intercept)] (r = 0.74). This might lead to inappropriate results. See 'Details' in '?rope'.